libname red "/.../F5500/microdata";
libname green "/.../F5500";
libname blue "/.../data";

%include "/.../programs/includes/format_source_2000.sas";
%include "/.../programs/includes/format_source_2001.sas";
%include "/.../programs/includes/format_source_2002.sas";
%include "/.../programs/includes/format_source_2004.sas";
%include "/.../programs/includes/format_source_2005.sas";
%include "/.../programs/includes/format_source_2014.sas";
%include "/.../programs/includes/pullplans_0008.sas"; 
%include "/.../programs/includes/pullplans_0914.sas"; 
%include "/.../programs/includes/pullschedb.sas"; 
%include "/.../programs/includes/read_schedb_foia.sas"; 
%include "/.../programs/includes/read_schedsb.sas"; 


/**************************************************************************
* Author : Dhiren Patki
*
* This program cleans form 5500 micro data
* 
* Between 2000 and 2009 F5500 input files are either of the 'research' 
* variety or of the 'raw' variety. The 'research files' are edited and 
* cleaned by DoL but contain only a sample of plans<100 participants 
* (small plans). The remanining small plans come from the 'raw' 
* files. These are not cleaned in any way other than de-duplication of 
* repeat filings for the same plan in the same form year.
* 
*
***************************************************************************/

/**************************************************************************
* Pull data from the main F5500 files. 
* Each call is one filing year of data: 1996-2014. 
***************************************************************************/

%include "/.../programs/includes/pullplans_96.sas"; 
%include "/.../programs/includes/pullplans_97.sas"; 
%include "/.../programs/includes/pullplans_98.sas"; 
%include "/.../programs/includes/pullplans_99.sas"; 
%pullplans_0008(2000,00);
%pullplans_0008(2001,01);
%pullplans_0008(2002,02);
%pullplans_0008(2003,03);
%pullplans_0008(2004,04);
%pullplans_0008(2005,05);
%pullplans_0008(2006,06);
%pullplans_0008(2007,07);
%pullplans_0008(2008,08);
%pullplans_09(2009,09);
%pullplans_1014(10);
%pullplans_1014(11);
%pullplans_1014(12);
%pullplans_1014(13);
%pullplans_1014(14);

/**************************************************************************
* Re-arrange the data from filing year to plan-year; attach sch H/I/SF
***************************************************************************/

/*Main form*/
data blue.pens1996 blue.pens1997 blue.pens1998 pens1999 pens2000 pens2001
     pens2002 pens2003 pens2004 pens2005 pens2006 pens2007 pens2008 pens2009 
     blue.pens2010 blue.pens2011 blue.pens2012 blue.pens2013 blue.pens2014;
     set blue.pensplan_samp96 blue.pensplan_samp97 blue.pensplan_samp98
     blue.pensplan_samp99 blue.pensplan_samp00 blue.pensplan_samp01
     blue.pensplan_samp02 blue.pensplan_samp03 blue.pensplan_samp04
     blue.pensplan_samp05 blue.pensplan_samp06 blue.pensplan_samp07
     blue.pensplan_samp08 blue.pensplan_samp09 blue.pensplan_samp10
     blue.pensplan_samp11 blue.pensplan_samp12 blue.pensplan_samp13
     blue.pensplan_samp14;

     /*Tag DB plans as frozen*/
     if /*redacted*/ then frozen=/*redacted*/;	/*plan is hard frozen*/
     else frozen=.;

    /*Label some variables*/
    label 
    adj_active = "Adjusted number of participants"
    adj_db_partcp = "Number of active DB plan participants (adjusted)"
    db = "DB plan indicator"
    db_partcp = "Number of active DB plan participants (unadjusted)"
    dc = "DC plan indicator"
    large = ">100 participants flag"
    form_year = "Form year/filing year"
    frozen = "Plan is frozen"
    cash_bal = "Plan is cash balance"
    num_DB_plans = "Number of DB plans associated with this EIN"
    plan_eff_year = "Year in which plan was born"
    plan_qtr_end = "Quarter in which plan calendar ends"
    plan_qtr_strt = "Quarter in which plan calendar starts"
    plan_year_end = "Year in which plan calendar ends"
    plan_year_strt = "Year in which plan calendar starts"
    source_id = "Source of F5500 record: /*redacted*/";

    if plan_year_end=1996 then output blue.pens1996;
    else if plan_year_end=1997 then output blue.pens1997;
    else if plan_year_end=1998 then output blue.pens1998;
    else if plan_year_end=1999 then output pens1999;
    else if plan_year_end=2000 then output pens2000;
    else if plan_year_end=2001 then output pens2001;
    else if plan_year_end=2002 then output pens2002;
    else if plan_year_end=2003 then output pens2003;
    else if plan_year_end=2004 then output pens2004;
    else if plan_year_end=2005 then output pens2005;
    else if plan_year_end=2006 then output pens2006;
    else if plan_year_end=2007 then output pens2007;
    else if plan_year_end=2008 then output pens2008;
    else if plan_year_end=2009 then output pens2009;
    else if plan_year_end=2010 then output blue.pens2010;
    else if plan_year_end=2011 then output blue.pens2011;
    else if plan_year_end=2012 then output blue.pens2012;
    else if plan_year_end=2013 then output blue.pens2013;
    else if plan_year_end=2014 then output blue.pens2014;
run;


proc datasets lib=blue nolist;
	delete pensplan_samp96 pensplan_samp97 pensplan_samp98 pensplan_samp99 pensplan_samp00 
	pensplan_samp01 pensplan_samp02 pensplan_samp03 pensplan_samp04 pensplan_samp05
        pensplan_samp06 pensplan_samp07 pensplan_samp08 pensplan_samp09 pensplan_samp10
	pensplan_samp11 pensplan_samp12 pensplan_samp13 pensplan_samp14;
quit;
run;

%macro fdeduper(yyyy);

  %if &yyyy. < 1999 OR &yyyy. > 2009 %then %do;

      proc sort data = blue.pens&yyyy nodupkey;
	  by opr_ein opr_pn filing_id_char;
      run;
  
  %end;

  %else %do;

      proc sort data = pens&yyyy nodupkey;
	  by opr_ein opr_pn filing_id_char;
      run;

  %end;

%mend;

%fdeduper(1996); %fdeduper(1997); %fdeduper(1998); %fdeduper(1999); %fdeduper(2000); %fdeduper(2001);
%fdeduper(2002); %fdeduper(2003); %fdeduper(2004); %fdeduper(2004); %fdeduper(2005); %fdeduper(2006);
%fdeduper(2007); %fdeduper(2008); %fdeduper(2009); %fdeduper(2010); %fdeduper(2011); %fdeduper(2012);
%fdeduper(2013); %fdeduper(2014); 

/* Contribution schedules (H,I,SF)
*  Note: Pre-1999 and post-2009 these data are included with the main form data pull*/

data s99 s00 s01 s02 s03 s04 s05 s06 s07 s08 s09;
     set blue.schhi1999 blue.schhi2000 blue.schhi2001 blue.schhi2002
     blue.schhi2003 blue.schhi2004 blue.schhi2005 blue.schhi2006
     blue.schhi2007 blue.schhi2008 blue.schhisf2009;
     if plan_year_end=1999 then output s99;
     else if plan_year_end=2000 then output s00;
     else if plan_year_end=2001 then output s01;
     else if plan_year_end=2002 then output s02;
     else if plan_year_end=2003 then output s03;
     else if plan_year_end=2004 then output s04;
     else if plan_year_end=2005 then output s05;
     else if plan_year_end=2006 then output s06;
     else if plan_year_end=2007 then output s07;
     else if plan_year_end=2008 then output s08;
     else if plan_year_end=2009 then output s09;
run;

proc datasets lib=blue nolist;
	delete schhi1999 schhi2000 schhi2001 schhi2002 schhi2003 schhi2004
        schhi2005 schhi2006 schhi2007 schhi2008 schhisf2009;
quit;
run;

/*Merge data from the main form with contribution schedules*/

%macro merger(yyyy,yy);

    data pens&yyyy;
	set pens&yyyy (drop = emplr_contrib_income_amt_n employee_contrib_income_amt_n tot_income_amt_n);
    run;

    proc sql;
	  create table blue.pens&yyyy as
	  select a.*, b.emplr_contrib_income_amt_n, b.employee_contrib_income_amt_n, b.tot_income_amt_n
	  from pens&yyyy as a left join 
	  s&yy as b on 
	  a.filing_id_char = b.filing_id_char;
    quit;

%mend;

%merger(1999,99); %merger(2000,00); %merger(2001,01); %merger(2002,02); %merger(2003,03);
%merger(2004,04); %merger(2005,05); %merger(2006,06); %merger(2007,07); %merger(2008,08);
%merger(2009,09);


/**************************************************************************
* Pull the raw schedule B data (1997-2007),(SB for 2009-14), 2008 missing
* Split by plan_end_year 
***************************************************************************/

/*Foia files (pre-1999)*/

/*1996 data*/
%read_schedb_foia(yy=96,Ind=C);
%read_schedb_foia(yy=96,Ind=F);
%read_schedb_foia(yy=96,Ind=R);

/*1997 data*/
%read_schedb_foia(yy=97,Ind=C);
%read_schedb_foia(yy=97,Ind=F);
%read_schedb_foia(yy=97,Ind=R);

/*1998 data*/
%read_schedb_foia(yy=98,Ind=C);
%read_schedb_foia(yy=98,Ind=F);
%read_schedb_foia(yy=98,Ind=R);

%macro clean_schedb_foia (yyyy, yy);

	data schb_&yyyy;
		set cschedb&yy
		fschedb&yy
		rschedb&yy;
	run;

	/*Do some re-naming and cleaning*/
	data blue.schb_&yyyy (keep= opr_ein opr_pn filing_id_char actrl_curr_value_ast_01_amt actrl_rpa94_info_curr_liab_amt 
	     actrl_liab_act_total_bnft_amt actrl_liab_rtd_total_bnft_amt actrl_invst_return_prcnt
	     actrl_cost_method_code actrl_rpa94_expt_incr_liab_amt actrl_weighted_rtm_age 
	     actrl_liab_act_partcp_cnt actrl_liab_rtd_partcp_cnt actrl_liab_term_partcp_cnt plan_year_end
	     rename = (actrl_curr_value_ast_01_amt = plan_assets
		       actrl_rpa94_info_curr_liab_amt = plan_liab
		       actrl_liab_act_total_bnft_amt = actives_liab
		       actrl_liab_rtd_total_bnft_amt = rtd_liab
		       actrl_invst_return_prcnt = asset_return
		       actrl_cost_method_code = cost_method_code
		       actrl_rpa94_expt_incr_liab_amt = xpct_liab_incr
		       actrl_weighted_rtm_age = plan_R_age
		       actrl_liab_act_partcp_cnt = actives_cnt
		       actrl_liab_rtd_partcp_cnt = rtd_cnt
		       actrl_liab_term_partcp_cnt = term_cnt));
		set schb_&yyyy;
		form_year=&yyyy;
		opr_ein = left(trim(ein));				/*rename for consistency with research file*/
		opr_pn = left(trim(plan_number));				/*rename for consistency with research file*/
		len_pn = length(opr_pn);					/*length of pn field*/
		if len_pn=/*redacted*/ then opr_pn = /*redacted*/;
		else if len_pn=/*redacted*/ then opr_pn = /*redacted*/;
		/*Redacted: removing invalid EIN/PNs*/
		filing_id = /*redacted*/;
		filing_id_char = dln;
		plan_year_end = /*redacted*/;
	run;

	/*Re-cast some variables from char to numeric*/

	data blue.schb_&yyyy (drop = plan_R_age actives_cnt rtd_cnt term_cnt
	      rename = (actives_cnt_n=actives_cnt rtd_cnt_n=rtd_cnt term_cnt_n=term_cnt plan_R_age_n = plan_R_age));
	      set blue.schb_&yyyy;      
	      plan_R_age_n=/*redacted*/;
	      actives_cnt_n=/*redacted*/;
	      rtd_cnt_n=/*redacted*/;
	      term_cnt_n=/*redacted*/;
	run;

%mend;

%clean_schedb_foia(1996,96);
%clean_schedb_foia(1997,97);
%clean_schedb_foia(1998,98);

/*Post-1999 files*/
%pullschedb(1999);
%pullschedb(2000);
%pullschedb(2001);
%pullschedb(2002);
%pullschedb(2003);
%pullschedb(2004);
%pullschedb(2005);
%pullschedb(2006);
%pullschedb(2007);
%schedsb0910(09);
%schedsb0910(10);
%schedsb1114(11);
%schedsb1114(12);
%schedsb1114(13);
%schedsb1114(14);


/*Recast some variables*/
data blue.schb_1999 (drop=actives_cnt rtd_cnt term_cnt
     rename = (actives_cnt_n=actives_cnt rtd_cnt_n=rtd_cnt term_cnt_n=term_cnt));
     set blue.schb_1999;
     actives_cnt_n = /*redacted*/;
     rtd_cnt_n = /*redacted*/;
     term_cnt_n = /*redacted*/;
run;

data blue.schb_2000 (drop=actives_cnt rtd_cnt term_cnt
     rename = (actives_cnt_n=actives_cnt rtd_cnt_n=rtd_cnt term_cnt_n=term_cnt));
     set blue.schb_2000;
     actives_cnt_n = /*redacted*/);
     rtd_cnt_n = /*redacted*/;
     term_cnt_n = /*redacted*/;
run;

/*Split the actuarial data by plan_year_end*/
data blue.schb_p1996 blue.schb_p1997 blue.schb_p1998 blue.schb_p1999
     blue.schb_p2000 blue.schb_p2001 blue.schb_p2002 blue.schb_p2003
     blue.schb_p2004 blue.schb_p2005 blue.schb_p2006 blue.schb_p2007
     blue.schb_p2008 blue.schsb_p2009 blue.schsb_p2010 blue.schsb_p2011
     blue.schsb_p2012 blue.schsb_p2013 blue.schsb_p2014;
     set blue.schb_1996 blue.schb_1997 blue.schb_1998 blue.schb_1999
     blue.schb_2000 blue.schb_2001 blue.schb_2002 blue.schb_2003
     blue.schb_2004 blue.schb_2005 blue.schb_2006 blue.schb_2007
     blue.schsb_2009 blue.schsb_2010 blue.schsb_2011
     blue.schsb_2012 blue.schsb_2013 blue.schsb_2014;
     if plan_year_end=1996 then output blue.schb_p1996;
     else if plan_year_end=1997 then output blue.schb_p1997;
     else if plan_year_end=1998 then output blue.schb_p1998;
     else if plan_year_end=1999 then output blue.schb_p1999;
     else if plan_year_end=2000 then output blue.schb_p2000;
     else if plan_year_end=2001 then output blue.schb_p2001;
     else if plan_year_end=2002 then output blue.schb_p2002;
     else if plan_year_end=2003 then output blue.schb_p2003;
     else if plan_year_end=2004 then output blue.schb_p2004;
     else if plan_year_end=2005 then output blue.schb_p2005;
     else if plan_year_end=2006 then output blue.schb_p2006;
     else if plan_year_end=2007 then output blue.schb_p2007;
     else if plan_year_end=2008 then output blue.schb_p2008;
     else if plan_year_end=2009 then output blue.schsb_p2009;
     else if plan_year_end=2010 then output blue.schsb_p2010;
     else if plan_year_end=2011 then output blue.schsb_p2011;
     else if plan_year_end=2012 then output blue.schsb_p2012;
     else if plan_year_end=2013 then output blue.schsb_p2013;
     else if plan_year_end=2014 then output blue.schsb_p2014;
run;


proc datasets lib=work nolist kill;
quit;
run;

proc datasets lib=blue nolist;
	delete schb_1996 schb_1997 schb_1998 schb_1999 schb_2000 schb_2001
	schb_2002 schb_2003 schb_2004 schb_2005 schb_2006 schb_2007 
	schsb_2009 schsb_2010 schsb_2011 schsb_2012 schsb_2013 schsb_2014;
quit;
run;

